*** LIS Cross-section Data center in Luxembourg

* email: usersupport@lisdatacenter.org 

*** LIS Self Teaching Package 2022
*** Part II: Gender, employment, and wages
*** Stata version

* last change of this version of the syntax: 15-01-2022.

/* The exercises in Part II emphasise the use of person-level data, including wages, 
demographics, and labour market information. Building on the techniques presented 
in Part I, they introduce regression modelling and continue to lead you through 
the process of developing a comparative analysis of inequality and poverty across countries.*/


** Exercise 1: Merging person and household data, selecting a sample

* Inputs for this exercise: the datasets to loop over, and the variables
* read from the household (hh) and person (pp) files
global datasets us04 be04 gr04
global varshh hid own
global varspp hid dname pwgt ppopwgt relation partner ageyoch age sex immigr educ educ_c emp status1 ptime1 hwage1

foreach ccyy of global datasets {
    * load the person file, then attach the household variables through hid
    use $varspp using $`ccyy'p, clear
    merge m:1 hid using $`ccyy'h, keepusing($varshh)
    * sample: age 25 to 54 and relation code up to 2200
    * (presumably household heads and their spouses or partners - confirm in the LIS codebook)
    drop if !inrange(age, 25, 54) | !(relation <= 2200)
    * own codes 100-199 become 1 and 200-299 become 0; other values are left as they are
    recode own (100/199 = 1) (200/299 = 0), generate(homeowner)
    * weighted mean of the 0/1 indicator
    summarize homeowner [aweight = ppopwgt]
}


** Exercise 2: Stacking data, employment rates by gender

* Exercise 2 inputs: datasets to stack, then the variables taken from the
* household and person files
global datasets us04 be04 gr04
global varshh hid own
global varspp hid dname pwgt ppopwgt relation partner ageyoch age sex immigr educ educ_c emp status1 ptime1 hwage1

* make_data: build the stacked person-level file exercise2_LIS in the mydata
* directory. For each dataset named in the global datasets it loads the
* person variables (global varspp), merges the household variables
* (global varshh) on hid, keeps persons aged 25-54 with relation<=2200,
* and appends the result to what has been saved so far.
* capture lets the definition be re-run in the same Stata session.
capture program drop make_data
program define make_data
    * flag for the first dataset, which creates the file instead of appending
    local first 1
    foreach ccyy in $datasets {
        use $varspp using $`ccyy'p, clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)
        keep if inrange(age,25,54) & relation<=2200
        * every later dataset is stacked on top of the file saved so far
        if !`first' {
            append using ${mydata}exercise2_LIS
        }
        save ${mydata}exercise2_LIS, replace
        local first 0
    }
end

* get_descriptives: weighted (aw=ppopwgt) descriptives for observations with
* sex==2 (presumably women - confirm in the LIS codebook):
*   1. distribution of emp, separately for each dname
*   2. row percentages of ptime1 by dname among those with emp==1
* capture lets the definition be re-run in the same Stata session.
capture program drop get_descriptives
program define get_descriptives
    bysort dname: tabulate emp [aw=ppopwgt] if sex==2
    tabulate dname ptime1 [aw=ppopwgt] if emp==1 & sex==2, row nofreq
end

* Build the stacked file without printing the intermediate output, reload it
* from the mydata directory, and print the descriptives
quietly make_data
use ${mydata}exercise2_LIS, clear
get_descriptives


** Exercise 3: Family structure and employment

* Exercise 3 inputs: datasets to stack, then the variables taken from the
* household and person files
global datasets us04 be04 gr04
global varshh hid own
global varspp hid dname pwgt ppopwgt relation partner ageyoch age sex immigr educ educ_c emp status1 ptime1 hwage1

* make_data: build the stacked person-level file exercise2_LIS in the mydata
* directory. For each dataset named in the global datasets it loads the
* person variables (global varspp), merges the household variables
* (global varshh) on hid, keeps persons aged 25-54 with relation<=2200,
* and appends the result to what has been saved so far.
* capture keeps this from failing when no earlier definition exists.
capture program drop make_data
program define make_data
    * flag for the first dataset, which creates the file instead of appending
    local first 1
    foreach ccyy in $datasets {
        use $varspp using $`ccyy'p, clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)
        keep if inrange(age,25,54) & relation<=2200
        * every later dataset is stacked on top of the file saved so far
        if !`first' {
            append using ${mydata}exercise2_LIS
        }
        save ${mydata}exercise2_LIS, replace
        local first 0
    }
end

* recode_data: create achildcat from ageyoch
*   0 = missing or 18 and above ("no children <18")
*   1 = 0 to 5, 2 = 6 to 17
* capture lets the definition be re-run in the same Stata session.
capture program drop recode_data
program define recode_data
    recode ageyoch (. 18/max= 0 "no children <18") (0/5 = 1 "<6 years") (6/17 = 2 "6-17 years"), gen(achildcat)
    label var achildcat "Lowest age of own children"
end

* get_descriptives: weighted (aw=ppopwgt) mean of emp for observations with
* sex==2, tabulated by dname, achildcat and partner, shown with 3 decimals.
* capture keeps this from failing when no earlier definition exists.
capture program drop get_descriptives
program define get_descriptives
    * the contents() option belongs to the table syntax used up to Stata 16;
    * when the version in effect is 17 or later the statistic() syntax is used
    if c(version) >= 17 {
        table (dname) (partner achildcat) [aw=ppopwgt] if sex==2, statistic(mean emp) nformat(%9.3f) nototals
    }
    else {
        table dname achildcat partner [aw=ppopwgt] if sex==2, contents(mean emp) format(%9.3f)
    }
end

* make_data is left commented out: the stacked file saved to the mydata
* directory in Exercise 2 is reused (presumably to avoid rebuilding it).
* Uncomment the next line to rebuild the file from the source datasets.
*quietly make_data
use ${mydata}exercise2_LIS, clear
quietly recode_data
get_descriptives


** Exercise 4: Dependent employment and hourly wages

* Exercise 4 inputs: datasets to stack, then the variables taken from the
* household and person files
global datasets us04 be04 gr04
global varshh hid own
global varspp hid dname pwgt ppopwgt relation partner ageyoch age sex immigr educ educ_c emp status1 ptime1 hwage1

* make_data: build the stacked person-level file exercise2_LIS in the mydata
* directory. For each dataset named in the global datasets it loads the
* person variables (global varspp), merges the household variables
* (global varshh) on hid, keeps persons aged 25-54 with relation<=2200,
* and appends the result to what has been saved so far.
* capture keeps this from failing when no earlier definition exists.
capture program drop make_data
program define make_data
    * flag for the first dataset, which creates the file instead of appending
    local first 1
    foreach ccyy in $datasets {
        use $varspp using $`ccyy'p, clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)
        keep if inrange(age,25,54) & relation<=2200
        * every later dataset is stacked on top of the file saved so far
        if !`first' {
            append using ${mydata}exercise2_LIS
        }
        save ${mydata}exercise2_LIS, replace
        local first 0
    }
end

* recode_data: prepare the Exercise 4 analysis variables
*   depemp   : 1 if status1 is 100-120, 0 if 200-240, missing otherwise
*   hourwage : hwage1 with negative values set to 0, then top and bottom
*              coded per dataset at exp(p75 + 3*IQR) and exp(p25 - 3*IQR),
*              where the percentiles are taken on the log wage (aw=ppopwgt)
* capture keeps this from failing when no earlier definition exists.
capture program drop recode_data
program define recode_data
    recode status1 (100/120=1) (200/240=0) (else=.), gen(depemp)
    label define depempl 0 "not in dependent employment" 1 "in dependent employment"
    label values depemp depempl
    gen hourwage = hwage1
    replace hourwage=0 if hwage1<0
    gen hourwage_log=log(hourwage)
    * log(0) is missing: give zero wages a log of 0 so that they stay in the
    * distribution of non-missing hourly wages used for the percentiles
    replace hourwage_log=0 if hourwage_log==. & hourwage!=.
    * created once up front, so the loop does not depend on which dataset comes first
    gen iqr = .
    gen upper_bound = .
    gen lower_bound = .
    foreach ccyy in $datasets {
        sum hourwage_log [aw=ppopwgt] if dname=="`ccyy'", de
        replace iqr=r(p75)-r(p25) if dname=="`ccyy'"
        * bounds for extreme values, on the log scale
        replace upper_bound=r(p75) + (iqr * 3) if dname=="`ccyy'"
        replace lower_bound=r(p25) - (iqr * 3) if dname=="`ccyy'"
        * top code the hourly wage at the upper bound
        replace hourwage=exp(upper_bound) if hourwage>exp(upper_bound) & !mi(hourwage) & dname=="`ccyy'"
        * bottom code the hourly wage at the lower bound
        replace hourwage=exp(lower_bound) if hourwage<exp(lower_bound) & !mi(hourwage) & dname=="`ccyy'"
    }
end

* get_descriptives: weighted (aw=ppopwgt) output per dataset
*   1. row percentages of depemp by sex among observations with emp==1
*   2. detailed summary of hourwage by dname and sex
* The bysort prefixes leave the data sorted by dname and sex.
* capture keeps this from failing when no earlier definition exists.
capture program drop get_descriptives
program define get_descriptives
    bysort dname: tab sex depemp [aw=ppopwgt] if emp==1, row nofreq
    bysort dname sex: sum hourwage [aw=ppopwgt], de
end

* make_data is left commented out: the stacked file saved to the mydata
* directory in Exercise 2 is reused (presumably to avoid rebuilding it).
* Uncomment the next line to rebuild the file from the source datasets.
*quietly make_data
use ${mydata}exercise2_LIS, clear
quietly recode_data
get_descriptives


** Exercise 5: Hourly wages, education, and country-specific variables

* Exercise 5 inputs: datasets to stack, then the variables taken from the
* household and person files
global datasets us04 be04 gr04
global varshh hid own
global varspp hid dname pwgt ppopwgt relation partner ageyoch age sex immigr educ educ_c emp status1 ptime1 hwage1

* make_data: build the stacked person-level file exercise2_LIS in the mydata
* directory. For each dataset named in the global datasets it loads the
* person variables (global varspp), merges the household variables
* (global varshh) on hid, keeps persons aged 25-54 with relation<=2200,
* and appends the result to what has been saved so far.
* capture keeps this from failing when no earlier definition exists.
capture program drop make_data
program define make_data
    * flag for the first dataset, which creates the file instead of appending
    local first 1
    foreach ccyy in $datasets {
        use $varspp using $`ccyy'p, clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)
        keep if inrange(age,25,54) & relation<=2200
        * every later dataset is stacked on top of the file saved so far
        if !`first' {
            append using ${mydata}exercise2_LIS
        }
        save ${mydata}exercise2_LIS, replace
        local first 0
    }
end

* recode_data: prepare the Exercise 5 analysis variables
*   hourwage : hwage1 with negative values set to 0, then top and bottom
*              coded per dataset at exp(p75 + 3*IQR) and exp(p25 - 3*IQR),
*              where the percentiles are taken on the log wage (aw=ppopwgt)
*   educ_c   : value label detached so that the numeric codes are displayed
* capture keeps this from failing when no earlier definition exists.
capture program drop recode_data
program define recode_data
    gen hourwage = hwage1
    replace hourwage=0 if hwage1<0
    gen hourwage_log=log(hourwage)
    * log(0) is missing: give zero wages a log of 0 so that they stay in the
    * distribution of non-missing hourly wages used for the percentiles
    replace hourwage_log=0 if hourwage_log==. & hourwage!=.
    * created once up front, so the loop does not depend on which dataset comes first
    gen iqr = .
    gen upper_bound = .
    gen lower_bound = .
    foreach ccyy in $datasets {
        sum hourwage_log [aw=ppopwgt] if dname=="`ccyy'", de
        replace iqr=r(p75)-r(p25) if dname=="`ccyy'"
        * bounds for extreme values, on the log scale
        replace upper_bound=r(p75) + (iqr * 3) if dname=="`ccyy'"
        replace lower_bound=r(p25) - (iqr * 3) if dname=="`ccyy'"
        * top code the hourly wage at the upper bound
        replace hourwage=exp(upper_bound) if hourwage>exp(upper_bound) & !mi(hourwage) & dname=="`ccyy'"
        * bottom code the hourly wage at the lower bound
        replace hourwage=exp(lower_bound) if hourwage<exp(lower_bound) & !mi(hourwage) & dname=="`ccyy'"
    }
    label values educ_c .
end

* get_descriptives: cross-tabulate educ_c against educ per dataset, then
* print, for each dataset and each educ level 1-3, the ratio of the weighted
* (aw=ppopwgt) median hourwage of sex==2 to that of sex==1
* (presumably women relative to men - confirm in the LIS codebook).
* capture keeps this from failing when no earlier definition exists.
capture program drop get_descriptives
program define get_descriptives
    bysort dname: tab educ_c educ, mi
    foreach ccyy in $datasets {
        foreach e in 1 2 3 {
            quietly sum hourwage [aw=ppopwgt] if dname=="`ccyy'" & educ==`e' & sex==2, de
            * keep the numerator at full precision; rounding to two decimals
            * happens only once, on the final ratio
            local fwage = r(p50)
            quietly sum hourwage [aw=ppopwgt] if dname=="`ccyy'" & educ==`e' & sex==1, de
            local wageratio : di %9.2f `fwage'/r(p50)
            di "`ccyy' educ=`e': `wageratio'"
        }
    }
end

* make_data is left commented out: the stacked file saved to the mydata
* directory in Exercise 2 is reused (presumably to avoid rebuilding it).
* Uncomment the next line to rebuild the file from the source datasets.
*quietly make_data
use ${mydata}exercise2_LIS, clear
quietly recode_data
get_descriptives


** Exercise 6: Immigration and wages, understanding harmonisation

* Exercise 6 inputs: datasets to stack, then the variables taken from the
* household and person files
global datasets us04 be04 gr04
global varshh hid own
global varspp hid dname pwgt ppopwgt relation partner ageyoch age sex immigr educ educ_c emp status1 ptime1 hwage1

* make_data: build the stacked person-level file exercise2_LIS in the mydata
* directory. For each dataset named in the global datasets it loads the
* person variables (global varspp), merges the household variables
* (global varshh) on hid, keeps persons aged 25-54 with relation<=2200,
* and appends the result to what has been saved so far.
* capture keeps this from failing when no earlier definition exists.
capture program drop make_data
program define make_data
    * flag for the first dataset, which creates the file instead of appending
    local first 1
    foreach ccyy in $datasets {
        use $varspp using $`ccyy'p, clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)
        keep if inrange(age,25,54) & relation<=2200
        * every later dataset is stacked on top of the file saved so far
        if !`first' {
            append using ${mydata}exercise2_LIS
        }
        save ${mydata}exercise2_LIS, replace
        local first 0
    }
end

* recode_data: prepare the Exercise 6 analysis variables
*   hourwage : hwage1 with negative values set to 0, then top and bottom
*              coded per dataset at exp(p75 + 3*IQR) and exp(p25 - 3*IQR),
*              where the percentiles are taken on the log wage (aw=ppopwgt)
*   educ_c   : value label detached so that the numeric codes are displayed
* capture keeps this from failing when no earlier definition exists.
capture program drop recode_data
program define recode_data
    gen hourwage = hwage1
    replace hourwage=0 if hwage1<0
    gen hourwage_log=log(hourwage)
    * log(0) is missing: give zero wages a log of 0 so that they stay in the
    * distribution of non-missing hourly wages used for the percentiles
    replace hourwage_log=0 if hourwage_log==. & hourwage!=.
    * created once up front, so the loop does not depend on which dataset comes first
    gen iqr = .
    gen upper_bound = .
    gen lower_bound = .
    foreach ccyy in $datasets {
        sum hourwage_log [aw=ppopwgt] if dname=="`ccyy'", de
        replace iqr=r(p75)-r(p25) if dname=="`ccyy'"
        * bounds for extreme values, on the log scale
        replace upper_bound=r(p75) + (iqr * 3) if dname=="`ccyy'"
        replace lower_bound=r(p25) - (iqr * 3) if dname=="`ccyy'"
        * top code the hourly wage at the upper bound
        replace hourwage=exp(upper_bound) if hourwage>exp(upper_bound) & !mi(hourwage) & dname=="`ccyy'"
        * bottom code the hourly wage at the lower bound
        replace hourwage=exp(lower_bound) if hourwage<exp(lower_bound) & !mi(hourwage) & dname=="`ccyy'"
    }
    label values educ_c .
end

* get_descriptives: print, for each dataset and each value of immigr (0, 1),
* the ratio of the weighted (aw=ppopwgt) median hourwage of sex==2 to that
* of sex==1 (presumably women relative to men - confirm in the LIS codebook).
* capture keeps this from failing when no earlier definition exists.
capture program drop get_descriptives
program define get_descriptives
    foreach ccyy in $datasets {
        foreach i in 0 1 {
            quietly sum hourwage [aw=ppopwgt] if dname=="`ccyy'" & immigr==`i' & sex==2, de
            * keep the numerator at full precision; rounding to two decimals
            * happens only once, on the final ratio
            local fwage = r(p50)
            quietly sum hourwage [aw=ppopwgt] if dname=="`ccyy'" & immigr==`i' & sex==1, de
            local wageratio : di %9.2f `fwage'/r(p50)
            di "`ccyy' immigr=`i': `wageratio'"
        }
    }
end

* make_data is left commented out: the stacked file saved to the mydata
* directory in Exercise 2 is reused (presumably to avoid rebuilding it).
* Uncomment the next line to rebuild the file from the source datasets.
*quietly make_data
use ${mydata}exercise2_LIS, clear
quietly recode_data
get_descriptives


** Exercise 7: Wage regressions

* Exercise 7 inputs: datasets to stack, then the variables taken from the
* household and person files
global datasets us04 be04 gr04
global varshh hid own
global varspp hid dname pwgt ppopwgt relation partner ageyoch age sex immigr educ educ_c emp status1 ptime1 hwage1

* make_data: build the stacked person-level file exercise2_LIS in the mydata
* directory. For each dataset named in the global datasets it loads the
* person variables (global varspp), merges the household variables
* (global varshh) on hid, keeps persons aged 25-54 with relation<=2200,
* and appends the result to what has been saved so far.
* capture keeps this from failing when no earlier definition exists.
capture program drop make_data
program define make_data
    * flag for the first dataset, which creates the file instead of appending
    local first 1
    foreach ccyy in $datasets {
        use $varspp using $`ccyy'p, clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)
        keep if inrange(age,25,54) & relation<=2200
        * every later dataset is stacked on top of the file saved so far
        if !`first' {
            append using ${mydata}exercise2_LIS
        }
        save ${mydata}exercise2_LIS, replace
        local first 0
    }
end
 
* recode_data: prepare the Exercise 7 regression variables
*   homeowner  : own 100-199 -> 1, 200-299 -> 0, other values unchanged
*   achildcat  : ageyoch grouped into 0 (missing or 18+), 1 (0-5), 2 (6-17)
*   hourwage   : hwage1 with negative values set to 0, then top and bottom
*                coded per dataset at exp(p75 + 3*IQR) and exp(p25 - 3*IQR),
*                where the percentiles are taken on the log wage (aw=ppopwgt)
*   logwage    : natural log of the trimmed hourwage
*   agesq, youngchild, oldchild, mededuc, hieduc : regressors derived from
*                age, achildcat and educ
* capture keeps this from failing when no earlier definition exists.
capture program drop recode_data
program define recode_data
    recode own (100/199=1) (200/299=0), gen(homeowner)
    recode ageyoch (. 18/max = 0 "no children <18") (0/5 = 1 "<6 years") (6/17 = 2 "6-17   years"), gen(achildcat)
    label var achildcat "Lowest age of own children"
    gen hourwage = hwage1
    replace hourwage=0 if hwage1<0
    gen hourwage_log=log(hourwage)
    * log(0) is missing: give zero wages a log of 0 so that they stay in the
    * distribution of non-missing hourly wages used for the percentiles
    replace hourwage_log=0 if hourwage_log==. & hourwage!=.
    * created once up front, so the loop does not depend on which dataset comes first
    gen iqr = .
    gen upper_bound = .
    gen lower_bound = .
    foreach ccyy in $datasets {
        sum hourwage_log [aw=ppopwgt] if dname=="`ccyy'", de
        replace iqr=r(p75)-r(p25) if dname=="`ccyy'"
        * bounds for extreme values, on the log scale
        replace upper_bound=r(p75) + (iqr * 3) if dname=="`ccyy'"
        replace lower_bound=r(p25) - (iqr * 3) if dname=="`ccyy'"
        * top code the hourly wage at the upper bound
        replace hourwage=exp(upper_bound) if hourwage>exp(upper_bound) & !mi(hourwage) & dname=="`ccyy'"
        * bottom code the hourly wage at the lower bound
        replace hourwage=exp(lower_bound) if hourwage<exp(lower_bound) & !mi(hourwage) & dname=="`ccyy'"
    }
    label values educ_c .
    gen logwage = ln(hourwage)
    gen agesq=age^2
    * 0/1 indicators; the omitted categories are achildcat==0 and educ==1
    recode achildcat (1=1) (0 2=0) (else=.), gen(youngchild)
    recode achildcat (2=1) (0 1=0) (else=.), gen(oldchild)
    recode educ (2=1) (1 3=0) (else=.), gen(mededuc)
    recode educ (3=1) (1 2=0) (else=.), gen(hieduc)
end

* get_estimates: for each value of sex (1, 2), run the log-wage regression
* separately for every dataset in the global datasets (pw=ppopwgt, robust
* standard errors) and print one esttab table with one column per dataset.
* eststo and esttab come from the user-written estout package.
* capture lets the definition be re-run in the same Stata session.
capture program drop get_estimates
program define get_estimates
    * start from an empty list of stored estimates, so that leftovers from
    * earlier in the session cannot add columns that mtitles does not cover
    eststo clear
    foreach g in 1 2 {
        foreach ccyy in $datasets {
            quietly eststo: regress logwage age agesq mededuc hieduc immigr partner youngchild oldchild ptime1 homeowner [pw=ppopwgt] if sex==`g' & dname=="`ccyy'", vce(robust)
        }
        esttab, b(a2) se(a2) r2(a3) mtitles($datasets)
        * reset before the models for the next value of sex are stored
        eststo clear
    }
end
 
* make_data is left commented out: the stacked file saved to the mydata
* directory in Exercise 2 is reused (presumably to avoid rebuilding it).
* Uncomment the next line to rebuild the file from the source datasets.
*quietly make_data 
use ${mydata}exercise2_LIS, clear 
quietly recode_data 
get_estimates
 
 
** Exercise 8: Pooled regressions and normalised weights

* Exercise 8 inputs: datasets to stack, then the variables taken from the
* household and person files
global datasets us04 be04 gr04
global varshh hid own
global varspp hid dname pwgt ppopwgt relation partner ageyoch age sex immigr educ educ_c emp status1 ptime1 hwage1

* make_data: build the stacked person-level file exercise2_LIS in the mydata
* directory. For each dataset named in the global datasets it loads the
* person variables (global varspp), merges the household variables
* (global varshh) on hid, keeps persons aged 25-54 with relation<=2200,
* and appends the result to what has been saved so far.
* capture keeps this from failing when no earlier definition exists.
capture program drop make_data
program define make_data
    * flag for the first dataset, which creates the file instead of appending
    local first 1
    foreach ccyy in $datasets {
        use $varspp using $`ccyy'p, clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)
        keep if inrange(age,25,54) & relation<=2200
        * every later dataset is stacked on top of the file saved so far
        if !`first' {
            append using ${mydata}exercise2_LIS
        }
        save ${mydata}exercise2_LIS, replace
        local first 0
    }
end

* recode_data: prepare the Exercise 8 pooled-regression variables
*   homeowner  : own 100-199 -> 1, 200-299 -> 0, other values unchanged
*   achildcat  : ageyoch grouped into 0 (missing or 18+), 1 (0-5), 2 (6-17)
*   hourwage   : hwage1 with negative values set to 0, then top and bottom
*                coded per dataset at exp(p75 + 3*IQR) and exp(p25 - 3*IQR),
*                where the percentiles are taken on the log wage (aw=pwgt)
*   logwage, agesq, youngchild, oldchild, mededuc, hieduc : derived regressors
*   hourwage_ppp, logwage_ppp : hourwage divided by the dataset factor ppp
*   belgium, greece : 0/1 indicators for be04 and gr04 (us04 is the reference)
* capture keeps this from failing when no earlier definition exists.
capture program drop recode_data
program define recode_data
    recode own (100/199=1) (200/299=0), gen(homeowner)
    recode ageyoch (. 18/max= 0 "no children <18") (0/5 = 1 "<6 years") (6/17 = 2 "6-17 years"), gen(achildcat)
    label var achildcat "Lowest age of own children"
    gen hourwage = hwage1
    replace hourwage=0 if hwage1<0
    gen hourwage_log=log(hourwage)
    * log(0) is missing: give zero wages a log of 0 so that they stay in the
    * distribution of non-missing hourly wages used for the percentiles
    replace hourwage_log=0 if hourwage_log==. & hourwage!=.
    * created once up front, so the loop does not depend on which dataset comes first
    gen iqr = .
    gen upper_bound = .
    gen lower_bound = .
    foreach ccyy in $datasets {
        sum hourwage_log [aw=pwgt] if dname=="`ccyy'", de
        replace iqr=r(p75)-r(p25) if dname=="`ccyy'"
        * bounds for extreme values, on the log scale
        replace upper_bound=r(p75) + (iqr * 3) if dname=="`ccyy'"
        replace lower_bound=r(p25) - (iqr * 3) if dname=="`ccyy'"
        * top code the hourly wage at the upper bound
        replace hourwage=exp(upper_bound) if hourwage>exp(upper_bound) & !mi(hourwage) & dname=="`ccyy'"
        * bottom code the hourly wage at the lower bound
        replace hourwage=exp(lower_bound) if hourwage<exp(lower_bound) & !mi(hourwage) & dname=="`ccyy'"
    }
    label values educ_c .
    gen logwage = ln(hourwage)
    gen agesq=age^2
    * 0/1 indicators; the omitted categories are achildcat==0 and educ==1
    recode achildcat (1=1) (0 2=0) (else=.), gen(youngchild)
    recode achildcat (2=1) (0 1=0) (else=.), gen(oldchild)
    recode educ (2=1) (1 3=0) (else=.), gen(mededuc)
    recode educ (3=1) (1 2=0) (else=.), gen(hieduc)

    * conversion factors are hard-coded per dataset
    * NOTE(review): presumably purchasing power parities relative to the US - confirm the source
    gen ppp = .
    replace ppp=0.86 if dname=="be04"
    replace ppp=0.65 if dname=="gr04"
    replace ppp=1 if dname=="us04"
    gen hourwage_ppp = hourwage/ppp
    gen logwage_ppp = log(hourwage_ppp)
    gen belgium=0
    replace belgium=1 if dname=="be04"
    gen greece=0
    replace greece=1 if dname=="gr04"
end

* get_estimates: pooled log-wage regressions on logwage_ppp with dataset
* indicators belgium and greece (pw=pwgt, robust standard errors), estimated
* separately for sex==1 and sex==2 and printed side by side with the column
* titles Men and Women. eststo and esttab come from the user-written estout
* package.
* capture keeps this from failing when no earlier definition exists.
capture program drop get_estimates
program define get_estimates
    * start from an empty list of stored estimates, so that a second run does
    * not add columns beyond the two titles given in mtitles
    eststo clear
    quietly eststo: regress logwage_ppp age agesq mededuc hieduc immigr partner youngchild oldchild ptime1 homeowner belgium greece [pw=pwgt] if sex==1, vce(robust)
    quietly eststo: regress logwage_ppp age agesq mededuc hieduc immigr partner youngchild oldchild ptime1 homeowner belgium greece [pw=pwgt] if sex==2, vce(robust)
    esttab, b(a2) se(a2) r2(a3) mtitles(Men Women)
end

* make_data is left commented out: the stacked file saved to the mydata
* directory in Exercise 2 is reused (presumably to avoid rebuilding it).
* Uncomment the next line to rebuild the file from the source datasets.
*quietly make_data
use ${mydata}exercise2_LIS, clear
quietly recode_data
get_estimates